Sys.time()
rm(list = ls())
setwd([PATH])
source("0_Initiation_Programs.R",encoding = "UTF-8") ; gc()
  cl<-makeCluster([CORES],type="SOCK", outfile="")
  registerDoSNOW(cl)

Convert_to_Rda<-F #Only run once, at the beginning
Define_STEM<-T
Clean_Initial_App_Data<-T
Combine_App_Data<-T
Construct_SATII_Data<-T
Construct_NSC_Data<-T
Construct_EDD_Data<-T
Construct_CA_HS_Graduation_Data<-T
  Generate_NSC_Link_File<-F #Only run once, at the beginning


Construct_CollegeBoard_Data<-T
Construct_Value_Added_Data<-T
Construct_ACS_Data<-T

  
  
  
  
  
if(Convert_to_Rda){
  
  Data<-read.sas7bdat(paste0(secure,"uadm_data_extract_zach_bleemer_3.sas7bdat"))
  for(v in names(Data)){
    Data[,v]<-as.character(Data[,v])
    if(!is.na(as.numeric(Data[,v])[1])) Data[,v]<-as.numeric(Data[,v])
  }
  save(Data,file=paste0(secure,"uadm_data_extract_3.Rda"))
  
  U<-read.sas7bdat(paste0(secure,"ulong_phase2_allyearsp92_15_0818.sas7bdat"))
  for(v in names(U)){
    U[,v]<-as.character(U[,v])
    if(!is.na(as.integer(U[,v])[1])) U[,v]<-as.integer(U[,v])
  }
  save(U,file=paste0(secure,"ulong_phase2_allyearsp92_15_0818.Rda"))
  
  
  for(y in 1994:2018){
    SAT<-read_sas(paste0(secure_cb,"SAS/cb",y,".sas7bdat"))
    save(SAT,file=paste0(secure_cb,"SAS/cb",y,".R"))
  }
  
  EDD<-read_csv(paste0(secure,"AEO_EDD_INBOUND_DATA_201807181143.csv"))
  save(EDD,file=paste0(secure,"AEO_EDD_INBOUND_DATA_201807181143.Rda"))
}




if(Define_STEM){
  ###
  #  Produces list of CIP codes used to define STEM majors
  ###
  load(paste0(secure_nsc,"Six_Year_Grad/Full_NSC_Combined.Rda"))
  
  stem<-unlist(read.csv("Data/Raw/Definition of STEM.csv",stringsAsFactors = F)) #Copy-pasted from 2016 DHS document defining STEM majors by CIP codes (Data/Raw/Definition of STEM.pdf)
    stem<-gsub("^.*[^0-9]([0-9]{6})[^0-9].*$","\\1",gsub("[.]","",stem[grepl("[.]",stem)]))
    stem[nchar(stem)==5]<-paste0("0",stem[nchar(stem)==5])
    for(z in stem[grepl("X",stem)]) stem<-c(stem,paste0(gsub("X","",z),addzeros(1:9999,4)))
    temp<-aggregate(Degree.CIP.1~Degree.Major.1,Full_NSC[!is.na(Full_NSC$Degree.CIP.1),],Mode)
    STEM<-temp$Degree.Major.1[temp$Degree.CIP.1%in%as.integer(stem)[!is.na(as.integer(stem))]]
    STEM<-STEM[!duplicated(STEM)]
  save(STEM,file="Data/Derived/NSC_STEM_fromCIP.Rda")
}







if(Clean_Initial_App_Data){
  load(paste0(secure,"uadm_data_extract_3.Rda"))

  ###
  #  Create unique ID's for matriculants
  ###
  Data$Campus_ID<-as.integer(substr(Data$LKEY,1,2))
  Data$ID<-gsub("\\s","",gsub("^..(.*$)","\\1",Data$LKEY))
    Data$ID[NA_to_F(Data$Campus_ID==1)]<-gsub("^[0-9]{3}(.*)$","\\1",Data$ID[NA_to_F(Data$Campus_ID==1)]) #UC Berkeley reports an extra numeral at the end of their IDs
    Data<-rbind(Data[Data$LKEY=="",],Data[!duplicated(Data[,c("Campus_ID","ID")])&Data$LKEY!="",])
  
  ###
  #  Process Names
  ###
  Data$FName<-titleCase(gsub("^[^,]*,\\s*|[ ,].*$","",Data$NAME))
  hi<-grepl("^[^,]*,\\s*[A-z]+[, ]+",Data$NAME) ; Data$MName[hi]<-titleCase(gsub("^[^,]*,\\s*[A-z]+[, ]+","",Data$NAME[hi]))
  Data$LName<-titleCase(gsub(",.*$","",Data$NAME))
  Data<-Data[,!names(Data)=="NAME"]
  
  ###
  #  Process prospective majors
  ###
  Majors<-read.csv("Data/Derived/Major_Codes_Real_Categorized.csv",stringsAsFactors = F) #Categorization of UC major codes
    Majors<-Majors[NA_to_F(Majors$Major!=""),]
    for(zzz in 1:2) Majors$MAJOR_CODE[nchar(Majors$MAJOR_CODE)<3]<-paste0("0",Majors$MAJOR_CODE[nchar(Majors$MAJOR_CODE)<3])
    Majors$STEM[Majors$Major%in%c("Biochemistry","Bioengineering","Biology","Biophysics","Chemical Engineering","Chemistry","Civil Engineering","Cognitive Science","Computer Engineering","Computer Science","Computer Science Engineering","Electrical Engineering","Environmental Studies","Geology","Industrial Engineering","Information","Materials Science","Mathematics","Mechanical Engineering","Neuroscience","Nutrition","Other Engineering","Other Natural Sciences","Physics","Statistics","Veterinary Medicine","Math Economics")]<-1 ; Majors$STEM[is.na(Majors$STEM)&!is.na(Majors$Major)]<-0 ; Majors<-rename(Majors,M_STEM=STEM)
    Majors$M_Art<-Majors$Major%in%c("Art","Creative Writing","Music","Theater") ; Majors$M_Art[is.na(Majors$M_Art)&!is.na(Majors$Major)]<-0
    Majors$M_Hum<-Majors$Major%in%c("Art History","Chinese","Classics","English","Film","French","German","History","Japanese","Other Humanities","Other Languages","Philosophy","Religion","Russian","Spanish") ; Majors$M_Hum[is.na(Majors$M_Hum)&!is.na(Majors$Major)]<-0
    Majors$M_SS<-Majors$Major%in%c("Anthropology","Ethnic Studies","Geography","International Studies","Law","Linguistics","Other Social Sciences","Political Science","Psychology","Sociology","Urban Planning","Women's Studies") ; Majors$M_SS[is.na(Majors$M_SS)&!is.na(Majors$Major)]<-0
    Majors$M_Sci<-Majors$Major%in%c("Biochemistry","Biology","Biophysics","Chemistry","Cognitive Science","Environmental Studies","Geology","Mathematics","Neuroscience","Other Natural Sciences","Physics","Statistics") ; Majors$M_Sci[is.na(Majors$M_Sci)&!is.na(Majors$Major)]<-0
    Majors$M_Eng<-Majors$Major%in%c("Bioengineering","Chemical Engineering","Civil Engineering","Computer Engineering","Computer Science","Computer Science Engineering","Electrical Engineering","Industrial Engineering","Information","Materials Science","Mechanical Engineering","Other Engineering") ; Majors$M_Eng[is.na(Majors$M_Eng)&!is.na(Majors$Major)]<-0
    Majors$M_Prof<-Majors$Major%in%c("Agriculture","Architecture","Communications","Criminology","Design","Education","Journalism","Kinesiology","Nursing","Nutrition","Other Health Sciences","Other Professional","Public Health","Public Policy","Social Welfare","Speech Pathology","Veterinary Medicine") ; Majors$M_Prof[is.na(Majors$M_Prof)&!is.na(Majors$Major)]<-0
    Majors$M_Bus<-Majors$Major%in%c("Accounting","Business","Economics","Finance","Marketing","Math Economics") ; Majors$M_Bus[is.na(Majors$M_Bus)&!is.na(Majors$Major)]<-0
  
  #Categorize reported prospective major for each UC campus into a discipline
  Data$Pros_Major<-rep(NA,nrow(Data))
  for(v in c("STEM","Art","Hum","SS","Sci","Eng","Prof","Bus","Sum")){
    Data[,paste0("Pros_Major_",v)]<-0
  }
  for(i in c("01","03","04","05","06","07","08","09","10")){ #Loop over campuses
    Majors[,paste0("Major",i)]<-Majors$Major
    Data<-merge(Data,Majors[,grepl(paste0("MAJOR_CODE|Major",i,"|M_"),names(Majors))],by.x=paste0("CMAJPRP",i),by.y="MAJOR_CODE",all.x=T)
    Data[NA_to_F(Data[,paste0("ENR",i)]==1),"Pros_Major"]<-Data[NA_to_F(Data[,paste0("ENR",i)]==1),paste0("Major",i)]
    for(v in c("STEM","Art","Hum","SS","Sci","Eng","Prof","Bus")){
      Data[NA_to_F(Data[,paste0("M_",v)]==1),paste0("Pros_Major_",v)]<-Data[NA_to_F(Data[,paste0("M_",v)]==1),paste0("Pros_Major_",v)]+1
      Data<-Data[,names(Data)!=paste0("M_",v)]
    }
    Data$Pros_Major_Sum[!is.na(Data[,paste0("Major",i)])]<-Data$Pros_Major_Sum[!is.na(Data[,paste0("Major",i)])]+1 #Only count if matches a major (no undeclared, double-major, missing from major data)
  }
  for(v in c("STEM","Art","Hum","SS","Sci","Eng","Prof","Bus")){ #Each variable is the % of campuses that applied to in that general area.
    Data[,paste0("Pros_Major_",v)]<-Data[,paste0("Pros_Major_",v)]/Data$Pros_Major_Sum
  }
  #Identify most common prospective major discipline
  Data$Pros_Major_Type<-!is.na(Data$Pros_Major)
  Data<-merge(Data,ddply(.data=Data[Data$Pros_Major_Type==0,c("Campus_ID","ID","Major01","Major03","Major04","Major05","Major06","Major07","Major08","Major09","Major10")],.variables=c("Campus_ID","ID"),function(x) Mode(unlist(x[,-c(1,2)])[!is.na(unlist(x[,-c(1,2)]))])),by=c("Campus_ID","ID"),all.x=T)
  Data$Pros_Major[is.na(Data$Pros_Major)]<-Data$V1[is.na(Data$Pros_Major)]
  Data$Pros_Major_Type[Data$Pros_Major_Type==0 & !is.na(Data$Pros_Major)]<-2
  Data<-Data[,names(Data)!="V1"]
  
  ###
  #  Process Dates
  ###
  Data$BIRTHDATE<-as.character(Data$BIRTHDATE)
  for(zzz in 1:3) Data$BIRTHDATE[NA_to_F(nchar(Data$BIRTHDATE)<6)]<-paste0("0",Data$BIRTHDATE[NA_to_F(nchar(Data$BIRTHDATE)<6)])
  Data$BYear<-1900+as.integer(substr(Data$BIRTHDATE,1,2)) ; Data$BYear[NA_to_F(Data$BYear<1920)]<-Data$BYear[NA_to_F(Data$BYear<1920)]+100
  Data$BMonth<-as.integer(substr(Data$BIRTHDATE,3,4))
  Data$BDay<-as.integer(substr(Data$BIRTHDATE,5,6))
  Data<-Data[,!names(Data)=="BIRTHDATE"]
  
  for(n in names(Data)[grepl("ADM",names(Data))]){
    Data[,n]<-as.integer(Data[,n])
  }
  for(n in c("hsgpa","trgpa")){
    Data[,n]<-as.numeric(Data[,n])
  }
  
  ###
  #  Process test scores
  ###
  Data$SAT_R<-as.integer(Data$SATRR) ; Data$SAT_R[is.na(Data$SAT_R)]<-as.integer(Data$SATIV[is.na(Data$SAT_R)])
  Data$SAT_M<-as.integer(Data$SATRM) ; Data$SAT_M[is.na(Data$SAT_M)]<-as.integer(Data$SATIM[is.na(Data$SAT_M)])
  Data$SAT_W<-as.integer(Data$SATRW)
  Data$SAT<-Data$SAT_R+Data$SAT_M
  Data<-Data[,!names(Data)%in%c("SATI")]
  Data$ACTCMP[Data$ACTCMP=="NaN"]<-Data$ACT[Data$ACTCMP=="NaN"]
  Data<-Data[,!names(Data)%in%c("ACT")]
  Data<-rename(Data,ACT=ACTCMP,ACT_Eng=ACTENG,ACT_M=ACTMTH,ACT_Read=ACTREA,ACT_Sci=ACTSCI,ACT_CompWrit=ACTEWW)
  for(v in c("ACT","ACT_Eng","ACT_M","ACT_Read","ACT_Sci","ACT_CompWrit")){
    Data[,v]<-as.integer(Data[,v])
  }
  for(v in c("SAT_R","SAT_M","SAT_W","SAT","satact")){
    Data[,v]<-as.integer(round(as.numeric(Data[,v])/10)*10)
  }
  
  ###
  #  Combine home address information
  ###
  Data$PERM_CITY[Data$PERM_CITY==""]<-Data$ADDR_PERM_CITY[Data$PERM_CITY==""]
  Data$PERM_STATE[Data$PERM_STATE==""]<-Data$ADDR_PERM_STATE[Data$PERM_STATE==""]
  Data$PERM_STREET[Data$PERM_STREET==""]<-Data$ADDR_PERM_STREET[Data$PERM_STREET==""]
  Data$PERM_ZIP[is.na(Data$PERM_ZIP)]<-Data$ADDR_PERM_ZIP_POSTAL[is.na(Data$PERM_ZIP)]
  Data$PERM_COUNTRY[Data$PERM_COUNTRY==""]<-Data$ADDR_PERM_COUNTRY[Data$PERM_COUNTRY==""]
  Data<-Data[,!(names(Data)%in%c("ADDR_PERM_STREET","ADDR_PERM_CITY","ADDR_PERM_STATE","ADDR_PERM_ZIP_POSTAL","ADDR_PERM_COUNTRY","Permanent_Street2"))]
  
  
  ###
  #  Process high school information
  ###
  #Pull data for HS cross-walk
  #NOTE: First save CPREVSCH and CSRCNAME information; includes out-of-state high schools and community colleges that aren't present in HS database
  HS<-read.csv("Data/Derived/CA_HS_CrossWalk.csv") #Crosswalk across high school codes
    for(n in c("County","District","School")) HS[,paste0("HS_",n)]<-HS[,n]
    HS$HS_Name<-HS$SCHOOL_NAME
    HS$HS_SDE_Code<-HS$SDE_CODE
  Data<-merge(Data,HS[,grepl("HS_|CPREVSCH",names(HS))],by="CPREVSCH",all.x=T,sort=F)
    Data<-Data[,!names(Data)%in%c("CSRCNAME")]
  
  
  save(Data,file=paste0(secure_derived,"AppData_Intermediate.Rda"))
  gc()
}







if(Construct_SATII_Data){ #Cleans UC SATII data for merge into applicant database
  load(paste0(secure,"TEST_F_201811071114.Rda"))
  Test_Catalog<-read_csv(paste0(secure,"TEST_CATALOG_D_201811021807.csv"))
  Info<-read_csv(paste0(secure,"TEST_INFO_D_201811141440.csv"))
  
  Term<-read.csv(paste0(secure,"TERM_D_201712061350.csv"))
    names(Term)[1]<-"T_KEY" ; Term<-rename(Term,YEARAPAY=T_ACAD_YR,CQUARTERAP=T_CD)
    Term$YEARAPAY<-as.numeric(gsub(",","",as.character(Term$YEARAPAY)))
  Tests<-merge(Tests,Term[,c("T_KEY","YEARAPAY","CQUARTERAP")],by="T_KEY",all.x=T)
  
  SATII<-Tests[Tests$TST_CATLG_KEY%in%Test_Catalog$TST_CATLG_KEY[grepl("SATII|Subject",Test_Catalog$TST_CATLG_M_TST_CATG_CD)],]
  SATII<-merge(SATII,Info[,c("TST_INFO_KEY","TST_INFO_TST_TY_NAM")])
  SATII<-merge(SATII,Test_Catalog[,c("TST_CATLG_KEY","TST_CATLG_M_TST_CD","TST_CATLG_ETS_BUS_NAM")]) ; gc()
  
  SATII<-SATII[!grepl("Projected|Derived",SATII$TST_CATLG_ETS_BUS_NAM),] #Duplicated or non-scores
  SATII<-SATII[!(grepl("^SATII Math[12] Score-Best",SATII$TST_CATLG_ETS_BUS_NAM)&SATII$YEARAPAY%in%c(2000,2001)),] #Duplicated scores
  

  SATII$TST_CATLG_M_TST_CD<-gsub("^SAT(II)?","",SATII$TST_CATLG_M_TST_CD)
  SATII<-SATII[,c("AL_KEY","TST_CATLG_M_TST_CD","TST_INFO_TST_TY_NAM","TST_SCO","YEARAPAY")]
    names(SATII)<-c("AL_KEY","SATII","SATII_Type","SATII_Score","YEARAPAY") ; gc()
  SATII<-SATII[!SATII$SATII%in%c("Math"),] #Duplicated scores
  SATII$SATII[SATII$SATII%in%c("WritingW","WritingUnk","WritingEnglishCompositionA","WritingEnglishCompositionwithEssayV")]<-"Writing" ; SATII$SATII[SATII$SATII%in%c("AmericanHistoryAndSocialStudies","UsHistory")]<-"USHistory" ; SATII$SATII[SATII$SATII=="Math2Calculator"]<-"Math2" ; SATII$SATII[SATII$SATII%in%c("ModernHebrew","HebrewModern")]<-"Hebrew" ; SATII$SATII[SATII$SATII=="HistoryWorld"]<-"WorldHistory" ; SATII$SATII<-gsub("(With)?Listening","",SATII$SATII) ; SATII$SATII[SATII$SATII=="Biology"]<-"EcologicalBiology"
  SATII<-SATII[!duplicated(SATII),]
  
  SATII<-SATII[order(SATII$AL_KEY,SATII$SATII,SATII$SATII_Score,decreasing=T),]
  SATII<-SATII[!duplicated(SATII[,c("AL_KEY","SATII")]),c("AL_KEY","SATII","SATII_Score")] #Take the highest observed score for each test
  SATII<-SATII[order(SATII$AL_KEY,SATII$SATII_Score),]
  SATII$Time[SATII$SATII=="Writing"]<-100
  SATII$Time[SATII$SATII=="Math1"]<-101
  SATII$Time[SATII$SATII=="Math2"]<-102
  SATII<-ddply(SATII,.(AL_KEY),function(x){
    x$Time[is.na(x$Time)]<-1:sum(is.na(x$Time))
    return(x)
  })
  SATII<-reshape(SATII,v.names=c("SATII","SATII_Score"),idvar=c("AL_KEY"),timevar="Time",direction="wide")
  SATII<-rename(SATII,SATII_W=SATII_Score.100,SATII_M1=SATII_Score.101,SATII_M2=SATII_Score.102)
  SATII<-SATII[,!names(SATII)%in%c("SATII.100","SATII.101","SATII.102")]
  names(SATII)<-gsub("[.]","_",names(SATII))
  
  save(SATII,file=paste0(secure,"SATII_Data.Rda"))
}



###
#  Construct NSC data
###
if(Construct_NSC_Data){
  load(paste0(secure_derived,"AppData_Intermediate.Rda"))
  
  # Produce submission file for National Student Clearinghouse link
  if(Generate_NSC_Link_File){ #Only ran once
    Data<-Data[Data$YEARAPAY>1994&Data$YEARAPAY<2017&!is.na(Data$BMonth),]
    Data$bmonth<-as.character(Data$BMonth) ; Data$bmonth[Data$BMonth<10]<-paste0("0",Data$BMonth[Data$BMonth<10])
    Data$BYear[Data$BYear>2005]<-Data$BYear[Data$BYear>2005]-100
    Data$bday<-as.character(Data$BDay) ; Data$bday[Data$BDay<10]<-paste0("0",Data$BDay[Data$BDay<10])
    Data$BirthDate<-paste0(Data$BYear,Data$bmonth,Data$bday)
    Data$MInit<-substr(Data$MName,1,1) ; Data$MInit[is.na(Data$MInit)]<-""
    Data$BeginDate<-paste0(Data$YEARAPAY,"0101")
    Data$SchCode<-""
    Data$SchCode[NA_to_F(Data$APP01==1)&Data$SchCode==""]<-"001312"
    Data$SchCode[NA_to_F(Data$APP10==1)&Data$SchCode==""]<-"041271"
    Data$SchCode[NA_to_F(Data$APP03==1)&Data$SchCode==""]<-"001313"
    Data$SchCode[NA_to_F(Data$APP04==1)&Data$SchCode==""]<-"001315"
    Data$SchCode[NA_to_F(Data$APP05==1)&Data$SchCode==""]<-"001316"
    Data$SchCode[NA_to_F(Data$APP06==1)&Data$SchCode==""]<-"001317"
    Data$SchCode[NA_to_F(Data$APP07==1)&Data$SchCode==""]<-"001321"
    Data$SchCode[NA_to_F(Data$APP08==1)&Data$SchCode==""]<-"001320"
    Data$SchCode[NA_to_F(Data$APP09==1)&Data$SchCode==""]<-"001314"
    Data$BranchCode<-"00"
    Data$Detail<-"D1"
    Data$SSN<-""
    Data$Suffix<-""
    Data$Blank<-""
    Data$ReturnField<-as.character(seq(1,nrow(Data)))
    Data<-Data[,c("Detail","SSN","FName","MInit","LName","Suffix","BirthDate","BeginDate","Blank","SchCode","BranchCode","ReturnField")]
    Data$LName<-substr(Data$LName,1,20) ; Data$FName<-substr(Data$FName,1,20)
    Data<-Data[NA_to_F(!Data$LName%in%c("","Nln","Error"))&NA_to_F(!Data$FName%in%c("","Nfn","Error"))&grepl("[A-z]",Data$FName)&grepl("[A-z]",Data$LName),]
    Data$MInit[!Data$MInit%in%LETTERS]<-""
    temp<-data.frame("H1","999111","","UNIV OF CA SYS (PRESIDENT'S OFFICE)","20180209","DA","S","","","","","",stringsAsFactors = F) ; names(temp)<-c("Detail","SSN","FName","MInit","LName","Suffix","BirthDate","BeginDate","Blank","SchCode","BranchCode","ReturnField") ; Data<-rbind(temp,Data)
    temp<-data.frame("T1",as.character(nrow(Data)+1),"","","","","","","","","","",stringsAsFactors = F) ; names(temp)<-c("Detail","SSN","FName","MInit","LName","Suffix","BirthDate","BeginDate","Blank","SchCode","BranchCode","ReturnField") ; Data<-rbind(Data,temp)
    write.table(Data,file=paste0(secure_nsc,"Six_Year_Grad/Full_NSC_Input_Mar2019.txt"),col.names=F,row.names=F,na="",quote=F,sep="\t")
    Full_NSC_Input<-Data; rm(Data)
    save(Full_NSC_Input,file=paste0(secure_nsc,"Six_Year_Grad/Full_NSC_Input_2019.Rda"))
  }
  
  #Pull in resulting NSC file
  Full_NSC<-read_csv(paste0(secure_nsc,"Six_Year_Grad/999111_357132_DETLRPT_DA_03212019051029_full_nsc_input_mar2019.csv"))
  save(Full_NSC,file=paste0(secure_nsc,"Six_Year_Grad/Full_NSC.Rda"))
  
  #Merge with input information, to merge back to applicant data
  names(Full_NSC)<-gsub("\\s",".",names(Full_NSC))
  load(paste0(secure_nsc,"Six_Year_Grad/Full_NSC_Input.Rda"))
  names(Full_NSC_Input)<-c("Detail","SSN","FName","MInit","LName","Suffix","BirthDate","BeginDate","Blank","SchCode","BranchCode","ReturnField")
  Full_NSC_Input<-Full_NSC_Input[,c("FName","MInit","LName","BirthDate","BeginDate","ReturnField")]
  Full_NSC_Input<-Full_NSC_Input[!is.na(Full_NSC_Input$BeginDate),]
  Full_NSC_Input<-Full_NSC_Input[!duplicated(Full_NSC_Input[,c("FName","MInit","LName","BirthDate","BeginDate")]),] #Remove duplicates
  gc()
  Full_NSC<-merge(Full_NSC,Full_NSC_Input,by.x="Requester.Return.Field",by.y="ReturnField")
  save(Full_NSC,file=paste0(secure_nsc,"Six_Year_Grad/Full_NSC_Combined.Rda"))
  rm(Full_NSC_Input) ; gc()
}








if(Construct_EDD_Data){
  load(paste0(secure,"AEO_EDD_INBOUND_DATA_201807181143.Rda"))
  EDD<-rename(EDD,ID=AEO_UNIQUE_IDENTIFIER)
  EDD$Year<-round(EDD$QUARTER_PERIOD/10)
  EDD$Quarter<-(EDD$QUARTER_PERIOD-(EDD$Year*10))
  EDD<-EDD[,!names(EDD)%in%c("GROUP_NUMBER","YEAR_CODE","QUARTER_PERIOD","COUNTY_CODE")] ; gc()
  
  hi<-EDD[,c("ID","Year")] ; gc()
  hi<-hi[!duplicated(hi),]
  hi$hi<-round(seq(1,nrow(hi))/10000)
  Sys.time() ; EDD<-merge(EDD,hi) ; gc()
  edd<-list()
  for(i in unique(hi$hi)){
    edd<-append(edd,list(EDD[EDD$hi==i,]))
    if(i%%10==0) print(i)
  } ; Sys.time() #Batch up into batches of 10,000
  
  process_edd<-function(EDD){
    EDD<-ddply(EDD,.(ID,Year),function(x){
      y<-x[1,c("ID","Year")]
      y$wage_sum<-sum(x$QUARTELY_SUBJECT_WAGES)
      for(q in 1:4){
        y[,paste0("wage_sum_Q",q)]<-sum(x$QUARTELY_SUBJECT_WAGES[x$Quarter==q])
      }
      return(y)
    })
    return(EDD)
  }
  
  rm(EDD) ; gc()
  Sys.time() ; EDD<-foreach(e=edd,.inorder=F,.packages = c("plyr"),.combine=rbind) %dopar% process_edd(e) ; Sys.time() #Process, looping over batches
  rm(edd) ; gc()
  
  #Now match to applicant ID number
  EDD<-rename(EDD,AEO_UNIQUE_IDENTIFIER=ID)
  ALKEY<-read.csv(paste0(secure,"AEO_Match_AL_KEY_2018_201906051035.csv"),stringsAsFactors = F)
    names(ALKEY)<-c("AEO_UNIQUE_IDENTIFIER","AL_KEY")
  EDD<-merge(EDD,ALKEY,all.x=T)
  
  EDD<-cbind(EDD[,names(EDD)%in%c("AEO_UNIQUE_IDENTIFIER","AL_KEY")],EDD[,!names(EDD)%in%c("AEO_UNIQUE_IDENTIFIER","AL_KEY")]) #Adjust column order
  
  #CPI-adjust to 2018
  CPI<-read.csv("Data/Raw/CPI_California.csv")
  EDD<-merge(EDD,CPI[,c(1:2)])
  for(v in names(EDD)[grepl("^wage_sum",names(EDD))]){
    EDD[,v]<-EDD[,v]/(EDD$CPI/CPI$CPI[CPI$Year==2018])
  }
  EDD<-EDD[,!names(EDD)=="CPI"]
  
  save(EDD,file=paste0(secure,"EDD_processed.Rda"))
}









if(Construct_CA_HS_Graduation_Data){
  #Note: This constructs a database of the annual number of graduates from each public California high school by ethnicity, conditional and unconditional on those graduates meeting UC admissions standards
  #Data source: https://www.cde.ca.gov/ds/sd/sd/filesgrads.asp
  G<-data.frame()
  for(i in c(93:99,0:17)){
    z<-"" ; if(i<10) z<-"0"
    a<-read_delim(paste0("Data/Raw/HS/grads",z,i,".txt"),delim="\t")
    a<-a[a$GRADS>0,] ; a_orig<-a
    
    if(i%in%93:97){
      #Code 1 = American Indian or Alaska Native ; Code 2 = Asian ; Code 3 = Pacific Islander ; Code 4 = Filipino ; Code 5 = Hispanic or Latino ; Code 6 = Black, not Hispanic ; Code 7 = White, not Hispanic
      a<-a[,c("CDS_CODE","ETHNIC","GENDER","UC_GRADS")]
      a<-dcast(a,CDS_CODE+ETHNIC~GENDER)
      a1<-dcast(a[,!names(a)=="F"],CDS_CODE~ETHNIC) ; names(a1)<-c("CDS_CODE","v1","v2","v3","v4","v5","v6","v7") ; for(v in names(a1)[grepl("^v",names(a1))]) a1[is.na(a1[,v]),v]<-0 ; a1$Asian_M<-a1$v2+a1$v4 ; a1$Filipino_M<-a1$v4 ; a1$Hispanic_M<-a1$v5 ; a1$Black_M<-a1$v6 ; a1$White_M<-a1$v7 ; a1$URM_M<-a1$v1+a1$v3+a1$v5+a1$v6 ; a1$Total_M<-a1$v1+a1$v2+a1$v3+a1$v4+a1$v5+a1$v6+a1$v7 ; a1<-a1[,!grepl("[0-9]",names(a1))]
      a2<-dcast(a[,!names(a)=="M"],CDS_CODE~ETHNIC) ; names(a2)<-c("CDS_CODE","v1","v2","v3","v4","v5","v6","v7") ; for(v in names(a2)[grepl("^v",names(a2))]) a2[is.na(a2[,v]),v]<-0 ; a2$Asian_F<-a2$v2+a2$v4 ; a2$Filipino_F<-a2$v4 ; a2$Hispanic_F<-a2$v5 ; a2$Black_F<-a2$v6 ; a2$White_F<-a2$v7 ; a2$URM_F<-a2$v1+a2$v3+a2$v5+a2$v6 ; a2$Total_F<-a2$v1+a2$v2+a2$v3+a2$v4+a2$v5+a2$v6+a2$v7 ; a2<-a2[,!grepl("[0-9]",names(a2))]
      a<-merge(a1,a2,all=T) ; names(a)[-1]<-paste0(names(a)[-1],"_UC") ; a_UC<-a
      
      a<-a_orig
      a<-a[,c("CDS_CODE","ETHNIC","GENDER","GRADS")]
      a<-dcast(a,CDS_CODE+ETHNIC~GENDER)
      a1<-dcast(a[,!names(a)=="F"],CDS_CODE~ETHNIC) ; names(a1)<-c("CDS_CODE","v1","v2","v3","v4","v5","v6","v7") ; for(v in names(a1)[grepl("^v",names(a1))]) a1[is.na(a1[,v]),v]<-0 ; a1$Asian_M<-a1$v2+a1$v4 ; a1$Filipino_M<-a1$v4 ; a1$Hispanic_M<-a1$v5 ; a1$Black_M<-a1$v6 ; a1$White_M<-a1$v7 ; a1$URM_M<-a1$v1+a1$v3+a1$v5+a1$v6 ; a1$Total_M<-a1$v1+a1$v2+a1$v3+a1$v4+a1$v5+a1$v6+a1$v7 ; a1<-a1[,!grepl("[0-9]",names(a1))]
      a2<-dcast(a[,!names(a)=="M"],CDS_CODE~ETHNIC) ; names(a2)<-c("CDS_CODE","v1","v2","v3","v4","v5","v6","v7") ; for(v in names(a2)[grepl("^v",names(a2))]) a2[is.na(a2[,v]),v]<-0 ; a2$Asian_F<-a2$v2+a2$v4 ; a2$Filipino_F<-a2$v4 ; a2$Hispanic_F<-a2$v5 ; a2$Black_F<-a2$v6 ; a2$White_F<-a2$v7 ; a2$URM_F<-a2$v1+a2$v3+a2$v5+a2$v6 ; a2$Total_F<-a2$v1+a2$v2+a2$v3+a2$v4+a2$v5+a2$v6+a2$v7 ; a2<-a2[,!grepl("[0-9]",names(a2))]
      a<-merge(a1,a2,all=T)
      a<-merge(a,a_UC)
    }
    if(i%in%c(98,99,0:8)){ #NOTE: The document says the switch happens in 2007, but the gender codes don't actually switch until 2009
      #Code 1 = American Indian or Alaska Native ; Code 2 = Asian ; Code 3 = Pacific Islander ; Code 4 = Filipino ; Code 5 = Hispanic or Latino ; Code 6 = African American, not Hispanic ; Code 7 = White, not Hispanic ; Code 8 = Multiple or No Response
      a<-a[,c("CDS_CODE","ETHNIC","GENDER","UC_GRADS")]
      a<-dcast(a,CDS_CODE+ETHNIC~GENDER)
      a1<-dcast(a[,!names(a)=="F"],CDS_CODE~ETHNIC) ; names(a1)<-c("CDS_CODE","v1","v2","v3","v4","v5","v6","v7","v8") ; for(v in names(a1)[grepl("^v",names(a1))]) a1[is.na(a1[,v]),v]<-0 ; a1$Asian_M<-a1$v2+a1$v4 ; a1$Filipino_M<-a1$v4 ; a1$Hispanic_M<-a1$v5 ; a1$Black_M<-a1$v6 ; a1$White_M<-a1$v7 ; a1$URM_M<-a1$v1+a1$v3+a1$v5+a1$v6 ; a1$Total_M<-a1$v1+a1$v2+a1$v3+a1$v4+a1$v5+a1$v6+a1$v7+a1$v8 ; a1<-a1[,!grepl("[0-9]",names(a1))]
      a2<-dcast(a[,!names(a)=="M"],CDS_CODE~ETHNIC) ; names(a2)<-c("CDS_CODE","v1","v2","v3","v4","v5","v6","v7","v8") ; for(v in names(a2)[grepl("^v",names(a2))]) a2[is.na(a2[,v]),v]<-0 ; a2$Asian_F<-a2$v2+a2$v4 ; a2$Filipino_F<-a2$v4 ; a2$Hispanic_F<-a2$v5 ; a2$Black_F<-a2$v6 ; a2$White_F<-a2$v7 ; a2$URM_F<-a2$v1+a2$v3+a2$v5+a2$v6 ; a2$Total_F<-a2$v1+a2$v2+a2$v3+a2$v4+a2$v5+a2$v6+a2$v7+a2$v8 ; a2<-a2[,!grepl("[0-9]",names(a2))]
      a<-merge(a1,a2,all=T) ; names(a)[-1]<-paste0(names(a)[-1],"_UC") ; a_UC<-a
      
      a<-a_orig
      a<-a[,c("CDS_CODE","ETHNIC","GENDER","GRADS")]
      a<-dcast(a,CDS_CODE+ETHNIC~GENDER)
      a1<-dcast(a[,!names(a)=="F"],CDS_CODE~ETHNIC) ; names(a1)<-c("CDS_CODE","v1","v2","v3","v4","v5","v6","v7","v8") ; for(v in names(a1)[grepl("^v",names(a1))]) a1[is.na(a1[,v]),v]<-0 ; a1$Asian_M<-a1$v2+a1$v4 ; a1$Filipino_M<-a1$v4 ; a1$Hispanic_M<-a1$v5 ; a1$Black_M<-a1$v6 ; a1$White_M<-a1$v7 ; a1$URM_M<-a1$v1+a1$v3+a1$v5+a1$v6 ; a1$Total_M<-a1$v1+a1$v2+a1$v3+a1$v4+a1$v5+a1$v6+a1$v7+a1$v8 ; a1<-a1[,!grepl("[0-9]",names(a1))]
      a2<-dcast(a[,!names(a)=="M"],CDS_CODE~ETHNIC) ; names(a2)<-c("CDS_CODE","v1","v2","v3","v4","v5","v6","v7","v8") ; for(v in names(a2)[grepl("^v",names(a2))]) a2[is.na(a2[,v]),v]<-0 ; a2$Asian_F<-a2$v2+a2$v4 ; a2$Filipino_F<-a2$v4 ; a2$Hispanic_F<-a2$v5 ; a2$Black_F<-a2$v6 ; a2$White_F<-a2$v7 ; a2$URM_F<-a2$v1+a2$v3+a2$v5+a2$v6 ; a2$Total_F<-a2$v1+a2$v2+a2$v3+a2$v4+a2$v5+a2$v6+a2$v7+a2$v8 ; a2<-a2[,!grepl("[0-9]",names(a2))]
      a<-merge(a1,a2,all=T)
      a<-merge(a,a_UC)
    }
    if(i%in%9:17){
      #Code 0 = Not reported ; Code 1 = American Indian or Alaska Native, Not Hispanic ; Code 2 = Asian, Not Hispanic ; Code 3 = Pacific Islander, Not Hispanic ; Code 4 = Filipino, Not Hispanic ; Code 5 = Hispanic or Latino ; Code 6 = African American, not Hispanic ; Code 7 = White, not Hispanic ; Code 9 = Two or More Races, Not Hispanic
      a<-a[,c("CDS_CODE","ETHNIC","GENDER","UC_GRADS")]
      a<-dcast(a,CDS_CODE+ETHNIC~GENDER)
      a1<-dcast(a[,!names(a)=="F"],CDS_CODE~ETHNIC) ; names(a1)<-c("CDS_CODE","v0","v1","v2","v3","v4","v5","v6","v7","v9") ; for(v in names(a1)[grepl("^v",names(a1))]) a1[is.na(a1[,v]),v]<-0 ; a1$Asian_M<-a1$v2+a1$v4 ; a1$Filipino_M<-a1$v4 ; a1$Hispanic_M<-a1$v5 ; a1$Black_M<-a1$v6 ; a1$White_M<-a1$v7 ; a1$URM_M<-a1$v1+a1$v3+a1$v5+a1$v6 ; a1$Total_M<-a1$v0+a1$v1+a1$v2+a1$v3+a1$v4+a1$v5+a1$v6+a1$v7+a1$v9 ; a1<-a1[,!grepl("[0-9]",names(a1))]
      a2<-dcast(a[,!names(a)=="M"],CDS_CODE~ETHNIC) ; names(a2)<-c("CDS_CODE","v0","v1","v2","v3","v4","v5","v6","v7","v9") ; for(v in names(a2)[grepl("^v",names(a2))]) a2[is.na(a2[,v]),v]<-0 ; a2$Asian_F<-a2$v2+a2$v4 ; a2$Filipino_F<-a2$v4 ; a2$Hispanic_F<-a2$v5 ; a2$Black_F<-a2$v6 ; a2$White_F<-a2$v7 ; a2$URM_F<-a2$v1+a2$v3+a2$v5+a2$v6 ; a2$Total_F<-a2$v0+a2$v1+a2$v2+a2$v3+a2$v4+a2$v5+a2$v6+a2$v7+a2$v9 ; a2<-a2[,!grepl("[0-9]",names(a2))]
      a<-merge(a1,a2,all=T) ; names(a)[-1]<-paste0(names(a)[-1],"_UC") ; a_UC<-a
      
      a<-a_orig
      a<-a[,c("CDS_CODE","ETHNIC","GENDER","GRADS")]
      a<-dcast(a,CDS_CODE+ETHNIC~GENDER)
      a1<-dcast(a[,!names(a)=="F"],CDS_CODE~ETHNIC) ; names(a1)<-c("CDS_CODE","v0","v1","v2","v3","v4","v5","v6","v7","v9") ; for(v in names(a1)[grepl("^v",names(a1))]) a1[is.na(a1[,v]),v]<-0 ; a1$Asian_M<-a1$v2+a1$v4 ; a1$Filipino_M<-a1$v4 ; a1$Hispanic_M<-a1$v5 ; a1$Black_M<-a1$v6 ; a1$White_M<-a1$v7 ; a1$URM_M<-a1$v1+a1$v3+a1$v5+a1$v6 ; a1$Total_M<-a1$v0+a1$v1+a1$v2+a1$v3+a1$v4+a1$v5+a1$v6+a1$v7+a1$v9 ; a1<-a1[,!grepl("[0-9]",names(a1))]
      a2<-dcast(a[,!names(a)=="M"],CDS_CODE~ETHNIC) ; names(a2)<-c("CDS_CODE","v0","v1","v2","v3","v4","v5","v6","v7","v9") ; for(v in names(a2)[grepl("^v",names(a2))]) a2[is.na(a2[,v]),v]<-0 ; a2$Asian_F<-a2$v2+a2$v4 ; a2$Filipino_F<-a2$v4 ; a2$Hispanic_F<-a2$v5 ; a2$Black_F<-a2$v6 ; a2$White_F<-a2$v7 ; a2$URM_F<-a2$v1+a2$v3+a2$v5+a2$v6 ; a2$Total_F<-a2$v0+a2$v1+a2$v2+a2$v3+a2$v4+a2$v5+a2$v6+a2$v7+a2$v9 ; a2<-a2[,!grepl("[0-9]",names(a2))]
      a<-merge(a1,a2,all=T)
      a<-merge(a,a_UC)
    }
    a$Year<-1900+i+100*(i<50)
    G<-rbind(G,a)
  }
  write.csv(G,file="Data/Derived/Graduation.csv")
}








if(Combine_App_Data){
  load(paste0(secure_derived,"AppData_Intermediate.Rda"))
  Data<-Data[inrange(Data$YEARAPAY,1994,2005),] #Restrict to 1994-2005 applications
  
  ###
  #  Merge application and National Student Clearinghouse data
  ###
  load(paste0(secure_nsc,"Six_Year_Grad/Full_NSC_Combined.Rda"))
    Full_NSC$YEARAPAY<-as.integer(substr(as.character(Full_NSC$BeginDate),1,4))
    Full_NSC$BYear<-as.integer(substr(Full_NSC$BirthDate,1,4))
    Full_NSC$BMonth<-as.integer(substr(Full_NSC$BirthDate,5,6))
    Full_NSC$BDay<-as.integer(substr(Full_NSC$BirthDate,7,8))
  Data$MInit<-substr(Data$MName,1,1) ; Data$MInit[is.na(Data$MInit)]<-""
  
  #Start with four-year enrollment in following semester
  Data_new<-data.frame()
  for(i in as.integer(names(table(Data$YEARAPAY)[inrange(names(table(Data$YEARAPAY)),1997,2005)]))){ #Start by using the UC NSC files pulled the following January (starting in 1997 and unavailable for 1994-1996). Has the advantage of catching the tiny fraction of students who haven't yet masked their NSC files
    file<-list.files(paste0(secure_nscannual,i),full.names = T)
    file<-file[order(sapply(file,file.size),decreasing=T)][1] #The largest file holds the data
    nsc<-read.csv(file)
    nsc$ETSNO<-as.numeric(substr(nsc$Requester.Return.Field,14,20))
    
    enrollment<-nsc[NA_to_F(inrange(nsc$Enrollment.Begin,i*10000+701,((i+1)*10000+9999))),] #Enrollment from Jan 1 senior year to the end of the following year (allowing gap year); last semester at community college counts as enrolling in post-secondary, but only at a two-year
    enrollment$`X2.year...4.year`<-as.integer(as.character(enrollment$`X2.year...4.year`)) ; enrollment<-enrollment[!is.na(enrollment$`X2.year...4.year`),] #tiny number of people with 'L' enrollment, not 2- or 4-year. e.g. 3 in 2005.
    enrollment<-enrollment[order(enrollment$ETSNO,-enrollment$`X2.year...4.year`),] #Keep four-year enrollment if present
    enrollment<-enrollment[!duplicated(enrollment$ETSNO),] #Unique on ID
    enrollment<-rename(enrollment,NSC_Enr_College_Name=College.Name,NSC_Enr_College_State=College.State,NSC_Enr_College_Code=`College.Code.Branch`)
    enrollment$NSC_Enr_College_Code<-as.character(enrollment$NSC_Enr_College_Code)
    enrollment$NSC_Enr_FourYear<-enrollment$`X2.year...4.year`==4
    enrollment$NSC_Enr_Public<-as.character(enrollment$`Public...Private`)=="Public"
    enrollment<-enrollment[,grepl("NSC|ETS",names(enrollment))]
    
    Data_new<-rbind.fill(Data_new,merge(Data[Data$YEARAPAY==i,],enrollment,by=c("ETSNO"),all.x=T))
    print(i)
  }
  rm(nsc,enrollment) ; gc()
  Data<-rbind.fill(Data_new,Data[Data$YEARAPAY<1997,])
  rm(Data_new)
  
  #Now augment using the full March 2019 pull; still four-year only
  enrollment<-Full_NSC[NA_to_F(Full_NSC$`2-year./.4-year`==4),c("Requester.Return.Field","FName","LName","BYear","BMonth","BDay","YEARAPAY","Enrollment.Begin","College.Name","College.State","College.Code/Branch")]
  enrollment<-enrollment[(enrollment$Enrollment.Begin>enrollment$YEARAPAY*10000+701)&(enrollment$Enrollment.Begin<(enrollment$YEARAPAY+6)*10000+701),]
  enrollment<-enrollment[order(enrollment$Enrollment.Begin),] #Keep four-year enrollment if present
  enrollment<-enrollment[!duplicated(enrollment$Requester.Return.Field),] #Unique on ID
  enrollment<-rename(enrollment,NSC_EnrFirst_College_Name=College.Name,NSC_EnrFirst_College_State=College.State,NSC_EnrFirst_College_Code=`College.Code/Branch`)
  enrollment<-enrollment[,!names(enrollment)%in%c("Enrollment.Begin","Requester.Return.Field")]
  enrollment<-enrollment[!duplicated(enrollment[,c("FName","LName","BYear","BMonth","BDay","YEARAPAY")]),]
  Data<-merge(Data,enrollment,all.x=T)
  Data$NSC_Enr_College_Name<-as.character(Data$NSC_Enr_College_Name) ; Data$NSC_EnrFirst_College_Name<-as.character(Data$NSC_EnrFirst_College_Name)
  check<-(is.na(Data$NSC_EnrFirst_College_State)&NA_to_F(Data$NSC_Enr_FourYear)&!is.na(Data$NSC_Enr_College_State)) ; Data$NSC_EnrFirst_College_State[check]<-Data$NSC_Enr_College_State[check] ; Data$NSC_EnrFirst_College_Name[check]<-Data$NSC_Enr_College_Name[check] ; Data$NSC_EnrFirst_College_Code[check]<-Data$NSC_Enr_College_Code[check] #Fills in for few cases where student appears in following-January NSC file but not 2018 NSC file.
  Data$NSC_Enr_College_Name<-as.factor(Data$NSC_Enr_College_Name) ; Data$NSC_EnrFirst_College_Name<-as.factor(Data$NSC_EnrFirst_College_Name)
  rm(enrollment) ; gc()
  
  #Now find first enrollment overall, including community colleges
  enrollment<-Full_NSC[,c("Requester.Return.Field","FName","LName","BYear","BMonth","BDay","YEARAPAY","Enrollment.Begin","College.Name","College.State","2-year./.4-year","College.Code/Branch")]
  enrollment<-enrollment[(enrollment$Enrollment.Begin>enrollment$YEARAPAY*10000+701)&(enrollment$Enrollment.Begin<(enrollment$YEARAPAY+6)*10000+701),]
  enrollment$hi<-enrollment$Enrollment.Begin ; enrollment$hi<-enrollment$hi+NA_to_T(enrollment$`2-year./.4-year`==2)*600 #Keep four-year enrollment if within six months of first two-year enrollment
  enrollment<-enrollment[order(enrollment$hi),!names(enrollment)=="hi"] #Keep first enrollment
  enrollment<-enrollment[!duplicated(enrollment$Requester.Return.Field),] #Unique on ID
  enrollment<-rename(enrollment,NSC_EnrFirstIncCC_College_Name=College.Name,NSC_EnrFirstIncCC_College_State=College.State,NSC_EnrFirstIncCC_College_Code=`College.Code/Branch`)
  enrollment<-enrollment[,!names(enrollment)%in%c("Enrollment.Begin","Requester.Return.Field","2-year./.4-year")]
  enrollment<-enrollment[!duplicated(enrollment[,c("FName","LName","BYear","BMonth","BDay","YEARAPAY")]),]
  Data<-merge(Data,enrollment,all.x=T)
  Data$NSC_Enr_College_Name<-as.character(Data$NSC_Enr_College_Name) ; Data$NSC_EnrFirstIncCC_College_Name<-as.character(Data$NSC_EnrFirstIncCC_College_Name)
  check<-(is.na(Data$NSC_EnrFirstIncCC_College_State)&!is.na(Data$NSC_Enr_College_State)) ; Data$NSC_EnrFirstIncCC_College_State[check]<-Data$NSC_Enr_College_State[check] ; Data$NSC_EnrFirstIncCC_College_Name[check]<-Data$NSC_Enr_College_Name[check] ; Data$NSC_EnrFirstIncCC_College_Code[check]<-Data$NSC_Enr_College_Code[check] #Fills in for cases where student appears in following-January NSC file but not 2018 NSC file.
  Data$NSC_Enr_College_Name<-as.factor(Data$NSC_Enr_College_Name) ; Data$NSC_EnrFirstIncCC_College_Name<-as.factor(Data$NSC_EnrFirstIncCC_College_Name)
  rm(enrollment) ; gc()
  print("Finished enrollment")
  
  #Find institutions that earned degrees from
  graduate<-Full_NSC[NA_to_F(Full_NSC$Graduation.Date<((Full_NSC$YEARAPAY+9)*10000))&NA_to_F(Full_NSC$Graduation.Date>((Full_NSC$YEARAPAY+1)*10000+200)),] #Before January 1, nine years later, and after the following January (otherwise get AA's too early)
  graduate$NSC_Grad_Degree[grepl("BACHELOR|^B[. ]*[ASFMHBCON](?!SIC)|BACCALAUR|\\sBS",graduate$Degree.Title,perl=T)]<-"BA"
  graduate$NSC_Grad_Degree[grepl("ASSOC|^A[. ]*[AS]",graduate$Degree.Title)]<-"AA"
  graduate$NSC_Grad_Degree[grepl("MASTER|(^|/)M[. ]*[ASBEPFMH]|JURIS|JD|DOCTOR|^PHARM|^PD|^DPT|^EDS|^DPH",graduate$Degree.Title)]<-"MA"
  graduate$NSC_Grad_Degree[graduate$Degree.Title==""]<-"Unknown"
  graduate<-graduate[!is.na(graduate$NSC_Grad_Degree)&!grepl("CERT",graduate$Degree.Title),]
  graduate$num<-1:nrow(graduate) #Pick out a single degree
  temp<-ddply(graduate[,c("num","Requester.Return.Field","NSC_Grad_Degree")],.(Requester.Return.Field),function(x){
    for(d in c("BA","MA","AA","Unknown")){ #Care most about undergrad school; will take grad school if necessary
      if(sum(grepl(d,x$NSC_Grad_Degree))>0) return(x[which(grepl(d,x$NSC_Grad_Degree))[1],])
    }
  })
  graduate<-graduate[graduate$num%in%temp$num,!names(graduate)=="num"]
  graduate$Major<-paste(graduate$Degree.Major.1,graduate$Degree.Major.2,graduate$Degree.Major.3,graduate$Degree.Major.4,sep=";;")
  graduate$CIP<-paste(graduate$Degree.CIP.1,graduate$Degree.CIP.2,graduate$Degree.CIP.3,graduate$Degree.CIP.4,sep=";;")
  graduate<-rename(graduate,NSC_Grad_College_Name=College.Name,NSC_Grad_College_State=College.State,NSC_Grad_Major=Major,NSC_Grad_CIP=CIP,NSC_Grad_College_Code=`College.Code/Branch`)
  graduate$NSC_Grad_FourYear<-graduate$`2-year./.4-year`==4
  graduate$NSC_Grad_Public<-as.character(graduate$`Public./.Private`)=="Public"
  graduate$NSC_Grad_Year<-round(graduate$Graduation.Date/10000)
  graduate$NSC_Grad_Month<-as.integer(substr(graduate$Graduation.Date,5,6))
  graduate<-graduate[,grepl("NSC",names(graduate))|names(graduate)%in%c("FName","MInit","LName","BYear","BMonth","BDay","YEARAPAY")]
  graduate<-graduate[!duplicated(graduate[,c("FName","LName","BYear","BMonth","BDay","YEARAPAY")]),]
  Data<-merge(Data,graduate,by=c("FName","MInit","LName","BYear","BMonth","BDay","YEARAPAY"),all.x=T)
  rm(graduate)
  print("Finished graduation")
  
  #Correct college names using codes (adjusts for small typos)
  for(t in c("Enr","EnrFirst","EnrFirstIncCC","Grad")){
    Data[,paste0("NSC_",t,"_College_Name")]<-as.character(Data[,paste0("NSC_",t,"_College_Name")])
    hi<-Data[,c(paste0("NSC_",t,"_College_Name"),paste0("NSC_",t,"_College_Code"))] ; names(hi)<-c("NSCNAME","Code")
    hi1<-aggregate(NSCNAME~Code,hi,Mode)
    Data<-merge(Data,hi1,by.x=paste0("NSC_",t,"_College_Code"),by.y="Code",all.x=T)
    Data[,paste0("NSC_",t,"_College_Name")]<-Data$NSCNAME ; Data<-Data[,!names(Data)=="NSCNAME"]
  }
  
  ###
  #  Identify graduate degrees
  ###
  graduate<-Full_NSC[NA_to_F(Full_NSC$`Graduated?`=="Y"),]
  graduate$BA<-grepl("BACHELOR|^B[. ]*[ASFMHBCON]|BACCALAUR|\\sBS",graduate$Degree.Title)
  graduate$AA<-grepl("ASSOC|^A[. ]*[AS]",graduate$Degree.Title)
  graduate$MA<-grepl("MASTER|(^|/)M[. ]*[ASBEPFMH]|JURIS|JD|DOCTOR|^PHARM|^PD|^DPT|^EDS|^DPH|MBA",graduate$Degree.Title) #Maybe should go back and delete certificates?
  graduate$MD<-grepl("DOCTOR OF MEDICINE|MEDICINAE DOCTORIS",graduate$Degree.Title)&!grepl("VET",graduate$Degree.Title)
  graduate$JD<-grepl("JURIS DOCTOR|JD|DOCTOR OF JURISPRUDENCE|DOCTOR OF LAW",graduate$Degree.Title)
  graduate$MBA<-grepl("MASTER OF BUSINESS|MBA|MASTER BUS",graduate$Degree.Title)
  load("Data/Derived/NSC_STEM_fromCIP.Rda") #Identify STEM graduate degrees from CIP codes
  graduate$MA_STEM<-graduate$MA&graduate$Degree.Major.1%in%STEM
  graduate<-graduate[!(!graduate$BA&!graduate$AA&!graduate$MA&!graduate$MD&!graduate$JD&!graduate$MBA&!graduate$MA_STEM),]
  graduate<-merge(graduate,Data[,c("FName","MInit","LName","BYear","BMonth","BDay","YEARAPAY")])
  temp<-ddply(graduate[,c("Requester.Return.Field","BA","AA","MA","MD","JD","MBA","MA_STEM")],.(Requester.Return.Field),function(x) data.frame(BA=sum(x$BA)>0,AA=sum(x$AA)>0,MA=sum(x$MA)>0,MD=sum(x$MD)>0,JD=sum(x$JD)>0,MBA=sum(x$MBA)>0,MA_STEM=sum(x$MA_STEM)>0)) #Slow; takes 10 minutes
  graduate<-graduate[!duplicated(graduate[,c("FName","LName","BYear","BMonth","BDay","YEARAPAY")]),c("Requester.Return.Field","FName","MInit","LName","BYear","BMonth","BDay","YEARAPAY")] ; graduate<-merge(graduate,temp) ; graduate<-graduate[,!names(graduate)=="Requester.Return.Field"]
  Data<-merge(Data,graduate,all.x=T)
  for(v in c("BA","MA","AA","MD","JD","MBA","MA_STEM")){
    Data[is.na(Data[,v]),v]<-0
  }
  
  rm(Full_NSC,enrollment,gradsch,graduate,enrollment1,hi,hi1) ; gc()
  print("Finished NSC")
  
  
  
  
  ###
  #  Add additional information from secondary UC admissions databases.
  ###
  AppF<-read.csv(paste0(secure,"APPLICATION_F_201809121728.csv"))
  Term<-read.csv(paste0(secure,"TERM_D_201712061350.csv"))
    names(Term)[1]<-"T_KEY" ; Term<-rename(Term,YEARAPAY=T_ACAD_YR,CQUARTERAP=T_CD)
    Term$YEARAPAY<-as.numeric(gsub(",","",as.character(Term$YEARAPAY)))
  AppF<-merge(AppF,Term[,c("T_KEY","YEARAPAY","CQUARTERAP")],by="T_KEY",all.x=T)
    names(AppF)[2]<-"AL_KEY" ; AppF<-rename(AppF,ETSNO=APPL_ID)
  AppF$AL_KEY<-as.integer(gsub("[^0-9]","",AppF$AL_KEY))
  Data<-merge(Data,AppF[,c("ETSNO","YEARAPAY","CQUARTERAP","AL_KEY","PAR_EDU_RSDNCY_KEY","PAR_OCCUPN_KEY","APPL_SLFRPTED_FRSHMN_GPA_SCO")],by=c("ETSNO","YEARAPAY","CQUARTERAP"),all.x=T)
  rm(AppF) ; gc()
  
  AppDM<-data.frame(read_csv(paste0(secure,"UAD_DM_201902201437.csv")),stringsAsFactors = F) ; names(AppDM)<-gsub("[^A-Z0-9_]","",names(AppDM))
    AppDM<-AppDM[,grepl("^(APPLICATION_ID|ACADEMIC_YR|TERM_CD)$|FIRST_GEN|DSBLTY_CD|VETERAN_FL|HSCRSE_TYP_ACADCRSE_|HSCRSE_TYP_ACADHONRS_12|HSCRSE_TYP_TOT|ELGBLITY_.*DESC|RESIDENCY_STATUS_DESC|INTEND_FIELD",names(AppDM))]
  Data<-merge(Data,AppDM,by.x=c("ETSNO","YEARAPAY","CQUARTERAP"),by.y=c("APPLICATION_ID","ACADEMIC_YR","TERM_CD"),all.x=T)
  rm(AppDM) ; gc()
  
  load(paste0(secure,"ulong_phase2_allyearsp92_15_0818.Rda"))
    names(U)<-gsub("00$","",names(U))
    U<-U[U$LKEY!="",]
    names(U)<-toupper(names(U))
    U<-rename(U,GPA1=GPACUM1,GPA2=GPACUM2,GPA3PLUS=GRADGPA3FWD,GPA_Grad=GRADGPA)
    U<-U[!duplicated(U$LKEY),] #Not sure where these come from; very few
  Data<-merge(Data,U[,c("LKEY","HASGRAD2","HASGRAD3","HASGRAD4","HASGRAD5","HASGRAD6","HASGRAD7","HASGRAD8")],all.x=T) #Indicators for earning UC degree 2-8 years after initial UC enrollment
  rm(U) ; gc()
  
  U<-read_csv(paste0(secure,"ULONG_SHADOW_201809141443.csv"))
  Data<-merge(Data,U[!duplicated(U$LKEY),c("LKEY","CIP6RECENT")],all.x=T)
  rm(U) ; gc()
  
  load(paste0(secure,"SATII_Data.Rda"))
  Data<-merge(Data,SATII[,c("AL_KEY","SATII_W","SATII_M1","SATII_M2","SATII_1","SATII_Score_1","SATII_2","SATII_Score_2","SATII_3","SATII_Score_3")],all.x=T)
  rm(SATII) ; gc()
  
  
  ###
  #  Geolocate applicants' home addresses. This occurs in four parts below
  ###
  
  ##Part 1: Clean 1997-2005 addresses
  cleanAddresses<-function(add){
    road<-"(Ave?|Dr|L[na]|Blvd|St|Pl|Wy|Tr|Sq|Ct|Rd|Cir|Avenue|Drive|Place|Lane|Boulevard|Street|Way|Terrace|Square|Court|Road|Circle)"
    add<-gsub("[/.;]","",add)
    add<-gsub("^\\s*[#]*\\s*","",add)
    add<-gsub("([0-9]+)-[0-9]+","\\1",add)
    add<-gsub("([0-9]+)\\s+([0-9]+\\s)","\\1\\2 ",add) #Need the space; otherwise break 111 10th St.
    add<-gsub("^\\s*([0-9]+)([A-Z])","\\1 \\2",add)
    add<-gsub("(,\\s*)?((A[Pp][Tt]|U[Nn][Ii][Tt]|N[oO])\\s*[#]?(N[Oo]\\s*)?|[#])\\s*[ 0-9]*[A-z]?[ 0-9]*$","",add)
    ignore<-grepl("^[0-9 NSEW]+R[td]e?[0-9 ]+$",add,ignore.case=T) #Don't remove the number from 111 Rd. 13
    add[!ignore]<-gsub(paste0(road,"[0-9 ]*[A-z]?[0-9 ]*$"),"\\1",add[!ignore],ignore.case=T)
    add<-gsub(paste0("([0-9A-z])",road),"\\1 \\2",add) #Can't ignore case here
    add<-gsub("([0-9A-z])\\s*[#][0-9A-Z]+\\s","\\1 ",add)
    add[grepl("P.?O|B[Oo][Xx]\\s*[0-9]",add)]<-""
    add<-sapply(add,simpleCap)
    return(add)
  }
  Data$PERM_STREET<-cleanAddresses(Data$PERM_STREET)
  fixZip<-(NA_to_F(Data$PERM_ZIP>99999999)&NA_to_F(Data$PERM_ZIP<1000000000))
    Data$PERM_ZIP[fixZip]<-floor(Data$PERM_ZIP[fixZip]/10000)
  if(0==1){ #Produce address file to be geolocated using ArcGIS. Only done once.
    addresses<-Data[,c("PERM_STREET","PERM_CITY","PERM_STATE","PERM_ZIP","PERM_COUNTRY")]
    addresses<-addresses[addresses$PERM_COUNTRY%in%c("United States","")&!(is.na(addresses$PERM_ZIP)&addresses$PERM_STREET==""),]
    addresses<-addresses[!duplicated(addresses)&addresses$PERM_STREET!="",]
    write.csv(addresses,file=paste0(secure,"uadm_data_addresses.csv"))
  }
    
  ##Part 2: Clean 1994-1996 addresses and produce address file to be geolocated using ArcGIS.
  if(0==1){ #Only done once
    AppD<-read_csv(paste0(secure,"APPLICANT_D_201712061100.csv"))
    AppD<-AppD[gsub(",","",AppD$AL_KEY)%in%Data$AL_KEY[Data$YEARAPAY%in%1994:1996],]
    AppD$PERM_STREET<-AppD$AL_CUR_ADDR_LN1_TXT
    AppD$Line2<-AppD$AL_CUR_ADDR_LN2_TXT
    AppD$Line2<-gsub("\\s*(US)?\\s*$","",AppD$Line2)
    AppD<-AppD[grepl("CA$",AppD$Line2),]
    AppD$PERM_CITY<-gsub("[, ]*CA$","",AppD$Line2)
    AppD$PERM_STATE<-"California"
    AppD$PERM_COUNTRY<-"United States"
    AppD$PERM_ZIP<-AppD$AL_CUR_ADDR_PSTL_CD
    
    AppD$PERM_STREET1<-cleanAddresses(AppD$PERM_STREET) #Noisier than later addresses; I geolocate both
    hi<-AppD[AppD$PERM_STREET1!="",c("AL_KEY","PERM_STREET1","PERM_CITY","PERM_STATE","PERM_ZIP","PERM_COUNTRY")] ; names(hi)[2]<-"PERM_STREET"
    hi<-rbind(hi,AppD[AppD$PERM_STREET!="",c("AL_KEY","PERM_STREET","PERM_CITY","PERM_STATE","PERM_ZIP","PERM_COUNTRY")])
    
    hi$AL_KEY<-gsub(",","",hi$AL_KEY)
    write.csv(hi,file=paste0(secure,"uadm_data_addressesPre1996.csv"))
    rm(AppD,hi) ; gc()
  }
    
  ##Part 3: Load 1997-2005 Lat-Longs from ArcGIS and geolocate into Census blocks
  blocks <- readShapeSpatial("Data/Raw/Census_Blocks/tl_2016_06_tabblock10.shp")
  Geolocate_Addresses<-function(addresses_done,blocks){
    #Geolocate
    addresses_CA<-addresses_done[addresses_done$PERM_STATE%in%c("CA","California")&!is.na(addresses_done$Longitude),]
    addresses_CA<-addresses_CA[!duplicated(addresses_CA),]
    latlong<-addresses_CA[,c("Longitude","Latitude")]
    latlong<-latlong[!is.na(latlong$Longitude)&!is.na(latlong$Latitude),]
    latlong<-SpatialPoints(latlong)
    Results<-addresses_CA[!is.na(addresses_CA$Longitude)&!is.na(addresses_CA$Latitude),]
    #Geolocate by Census Tract, Block (CA ONLY)
    results <- over(latlong, blocks)
    Results<-cbind(Results,results)
    Results<-Results[,names(Results)%in%c("PERM_STREET","PERM_CITY","PERM_ZIP","PERM_STATE","PERM_COUNTRY","Longitude","Latitude","GEOID10","AL_KEY")]
    addresses_done<-merge(addresses_done,Results,all.x=T)
    rm(results,Results,addresses_CA,latlong) ; gc()
    
    addresses_done$PERM_STATE[is.na(addresses_done$PERM_STATE)]<-"" #Only occurs for a few cases, e.g. St. Thomas
    for(var in c("PERM_STREET","PERM_CITY","PERM_STATE","PERM_COUNTRY")){
      addresses_done[addresses_done[,var]==" ",var]<-""
    }
    addresses_done<-addresses_done[!duplicated(addresses_done),]
    return(addresses_done)
  }
  add1<-read.csv(paste0(secure,"Addresses1.csv"),header=FALSE,sep=";",quote = "",stringsAsFactors = F) #Had to split into 4 for ARCGIS
  add2<-read.csv(paste0(secure,"Addresses2.csv"),header=FALSE,sep=";",quote = "",stringsAsFactors = F)
  add3<-read.csv(paste0(secure,"Addresses3.csv"),header=FALSE,sep=";",quote = "",stringsAsFactors = F)
  add4<-read.csv(paste0(secure,"Addresses4.csv"),header=FALSE,sep=";",quote = "",stringsAsFactors = F)
  addresses_done<-rbind(add1,add2,add3,add4)
    rm(add1,add2,add2017,add3,add4)
    names(addresses_done)<-c("Longitude","Latitude","PERM_STREET","PERM_CITY","PERM_STATE","PERM_ZIP","PERM_COUNTRY")
    addresses_done<-addresses_done[addresses_done$PERM_STREET!="",]
    addresses_done<-addresses_done[!duplicated(addresses_done),]
    addresses_done$PERM_CITY<-gsub("-","q",sapply(addresses_done$PERM_CITY,simpleCap))
    for(var in c("Longitude","Latitude")){
      addresses_done[,var]<-as.numeric(addresses_done[,var])
    }
  addresses_done<-Geolocate_Addresses(addresses_done,blocks)
    addresses_done<-addresses_done[,c("PERM_STREET","PERM_CITY","PERM_ZIP","PERM_STATE","PERM_COUNTRY","Longitude","Latitude","GEOID10")]
    Data$PERM_CITY<-gsub("-","q",sapply(Data$PERM_CITY,simpleCap))
    Data$PERM_STATE[is.na(Data$PERM_STATE)]<-""
  Data<-merge(Data,addresses_done,by=c("PERM_STREET","PERM_CITY","PERM_ZIP","PERM_STATE","PERM_COUNTRY"),all.x=T)
  
  ##Part 4: Load 1994-1996 Lat-Longs from ArcGIS and geolocate into Census blocks
  addresses_done<-read.csv(paste0(secure,"uadm_data_addressesPre1996_processed.csv"),stringsAsFactors = F)
    addresses_done<-addresses_done[addresses_done$Loc_name=="Street_Address",]
    addresses_done<-addresses_done[!duplicated(addresses_done$AL_KEY),c("AL_KEY","X","Y","PERM_STREET","PERM_CITY","PERM_STATE","PERM_COUNTRY","PERM_ZIP")]
    names(addresses_done)<-c("AL_KEY","Longitude","Latitude","PERM_STREET","PERM_CITY","PERM_STATE","PERM_ZIP","PERM_COUNTRY")
  addresses_done<-Geolocate_Addresses(addresses_done,blocks)
    addresses_done<-addresses_done[,c("AL_KEY","PERM_STREET","PERM_CITY","PERM_ZIP","PERM_STATE","PERM_COUNTRY","Longitude","Latitude","GEOID10")]
    Data$GEOID10<-as.character(Data$GEOID10)
    data<-Data[Data$YEARAPAY<1997,]
    data<-data[,!names(data)%in%c("Longitude","Latitude","PERM_STREET","PERM_CITY","PERM_ZIP","PERM_STATE","PERM_COUNTRY","GEOID10")]
  data<-merge(data,addresses_done,all.x=T)
  Data<-rbind(Data[!Data$YEARAPAY<1997,],data)
  rm(blocks) ; gc()
  
  
  ###
  #  Merge in EDD wage data
  ###
  load(paste0(secure,"EDD_Processed.Rda"))
    EDD<-EDD[EDD$AL_KEY%in%Data$AL_KEY&!is.na(EDD$AL_KEY),]
    hi<-Data[,c("AL_KEY","YEARAPAY")] ; hi<-hi[!duplicated(hi),] ; EDD<-merge(EDD,hi)
  for(i in seq(6,16)){
    EDD_merge<-EDD[EDD$Year==(EDD$YEARAPAY+i),c("AL_KEY","YEARAPAY","wage_sum")]
    names(EDD_merge)<-gsub("([^A-Z])$",paste0("\\1_",i),names(EDD_merge))
    Data<-merge(Data,EDD_merge,all.x=T)
    print(i) ; gc()
  }
  rm(EDD,EDD_merge) ; gc()
  
  #Further data cleaning
  for(v in c("APP01","APP03","APP04","APP05","APP06","APP07","APP08","APP09","APP10","SATRR","SATRM","SATIV","SATIM","SATIIW")){
    Data[,v]<-as.integer(Data[,v])
  }
  for(v in c("income_parent")){
    Data[,v]<-as.numeric(Data[,v])
  }
  for(v in names(Data)[grepl("CMAJPRP",names(Data))]){ #Add leading 0's if missing
    Data[,v]<-as.character(Data[,v])
    Data[is.na(Data[,v])&(v!="CMAJPRP10"),v]<-""
    for(zzz in 1:2) Data[nchar(Data[,v])%in%c(1,2),v]<-paste0("0",Data[nchar(Data[,v])%in%c(1,2),v])
  }
  
  save(Data,file=paste0(secure_derived,"uadm_data_extract_processed.Rda"))
  gc()
}









if(Construct_CollegeBoard_Data){
  ###
  #  Part 1: Clean College Board Data
  ###
  
  S<-data.frame()
  load(paste0(secure_cb,"SAS/cb1994.R")) ; S<-SAT[,!names(SAT)%in%"SSNSAT"] ; S$Year<-1994
  for(y in 1995:2010){
    load(paste0(secure_cb,"SAS/cb",y,".R"))
    SAT$Year<-y
    SAT<-SAT[,names(SAT)%in%names(S)]
    S<-rbind.fill(S,SAT)
    print(y)
  }
  S$LName<-tolower(S$LASTSAT)
  S$MInit<-tolower(S$MISAT)
  S$FName<-tolower(S$FIRSTSAT)
  S$BMonth<-as.integer(substr(S$DOBSAT,1,2))
  S$BDay<-as.integer(substr(S$DOBSAT,3,4))
  S$BYear<-as.integer(substr(S$DOBSAT,5,6)) ; S$BYear<-S$BYear+1900+100*(S$BYear<10)
  S$CPREVSCH<-as.integer(S$ATPCODE_CB) #What's AI_CODE?
  S$GradYear<-as.integer(substr(S$GRADSAT,3,4)) ; S$GradYear<-S$GradYear+1900+100*(S$GradYear<30)+(as.integer(substr(S$GRADSAT,1,2))>8)
  S$Pref_ClosetoHome<-S$WCHOME
  S$Pref_InState<-S$WCHMSTAT
  S$Pref_NearState<-S$WCNRSTA
  S$Pref_FarState<-S$WCOUTUS
  S$Ethnicity<-S$ETH7_CB
  S$Sex<-S$SEXSAT
  S$Year<-as.integer(substr(S$IDCB,1,4))
  S<-rename(S,Address=STRTSAT,City=CITYSAT,State=STATESAT,Zipcode=ZIPSAT,hsgpa_cb=RCGAPSDQ,income_parent_cb=INCOME_CB,Date_SAT=LATSTSAT,Rank_cb=CLASRANK,Grade_cb=GRDESAT)
  S<-S[,c("IDCB","LName","MInit","FName","BDay","BMonth","BYear","Ethnicity","ETHNSDQ","RCETHSDQ","Sex","GradYear","CPREVSCH","Address","City","State","Zipcode","hsgpa_cb","income_parent_cb","MOTHEDUC","FATHEDUC","SATIV_CB","SATIM_CB","SATIIW_CB","SATIIM1_CB","SATIIM2_CB","SATIIM_CB","SATIIT_CB","TYPIIT_CB","Year","Date_SAT","Rank_cb","Grade_cb",names(S)[grepl("^DI[0-9]*$",names(S))],"Pref_ClosetoHome","Pref_InState","Pref_NearState","Pref_FarState")]
  S<-S[!is.na(S$CPREVSCH),] #Shouldn't be missing; means only took PSAT/AP
  gc()
  
  #UC Sends
  ns<-c(4833,4834,4837,4839,4836,4860,4835,4859,4129) #https://collegereadiness.collegeboard.org/pdf/sat-domestic-code-list.pdf
  cs<-c("01","03","04","05","06","07","08","09","10")
  for(c in 1:9){
    S[,paste0("Send",cs[c])]<-0
    for(i in 1:30){
      z<-i ; if(i<10) z<-paste0(0,i)
      S[NA_to_F(S[,paste0("DI",z)]==ns[c]),paste0("Send",cs[c])]<-1
    }
  }
  S$SendUC<-S$Send01==1|S$Send03==1|S$Send04==1|S$Send05==1|S$Send06==1|S$Send07==1|S$Send08==1|S$Send09==1|S$Send10==1
  
  HS<-read.csv("Data/Derived/CA_HS_CrossWalk.csv")
  for(n in c("County","District","School")) HS[,paste0("HS_",n)]<-HS[,n]
  HS$HS_Name<-HS$SCHOOL_NAME
  HS$HS_SDE_Code<-HS$SDE_CODE
  S<-merge(S,HS[,grepl("HS_|CPREVSCH",names(HS))],by="CPREVSCH",all.x=T,sort=F)
  
  save(S,file=paste0(secure_cb,"SATUniverse.Rda"))
  
  ###
  #  Part 2: Link to UC applicant records
  ###
  S$FInit<-substr(S$FName,1,1)
  S$YEARAPAY<-S$Year
  load(paste0(secure,"uadm_data_extract_processed.Rda"))
  Data<-Data[Data$SCHTYPE%in%c("A")&!is.na(Data$SAT_M)&!is.na(Data$SAT_R),]
  for(v in c("LName","MInit","FName")) Data[,v]<-tolower(Data[,v])
  Data$FInit<-substr(Data$FName,1,1)
  
  #Start by matching on everything: name, birth date, year, and high school.
  S<-S[!allduplicated(S[,c("LName","MInit","FName","BDay","BMonth","BYear","CPREVSCH","YEARAPAY")]),] #3 people 1994-2005.
  Data<-merge(Data,S[,c("IDCB","LName","MInit","FName","BDay","BMonth","BYear","CPREVSCH","YEARAPAY")],all.x=T)
  Final<-Data[!is.na(Data$IDCB),]
  Data<-Data[is.na(Data$IDCB),!names(Data)=="IDCB"]
  S<-S[!S$IDCB%in%Final$IDCB,]
  Final$Quality<-1
  
  #Drop MInit.
  S<-S[!allduplicated(S[,c("LName","FName","BDay","BMonth","BYear","CPREVSCH","YEARAPAY")]),] #MName has 10 duplicates
  Data<-merge(Data,S[,c("IDCB","LName","FName","BDay","BMonth","BYear","CPREVSCH","YEARAPAY")],all.x=T) 
  Final<-rbind.fill(Final,Data[!is.na(Data$IDCB),])
  Data<-Data[is.na(Data$IDCB),!names(Data)=="IDCB"]
  S<-S[!S$IDCB%in%Final$IDCB,]
  
  #Add MInit and drop CPREVSCH.
  S<-S[!allduplicated(S[,c("LName","MInit","FName","BDay","BMonth","BYear","YEARAPAY")]),] #300 duplicates
  Data<-merge(Data,S[,c("IDCB","LName","MInit","FName","BDay","BMonth","BYear","YEARAPAY")],all.x=T) 
  Final<-rbind.fill(Final,Data[!is.na(Data$IDCB),])
  Data<-Data[is.na(Data$IDCB),!names(Data)=="IDCB"]
  S<-S[!S$IDCB%in%Final$IDCB,]
  
  #Add CPREVSCH and drop birthday.
  S<-S[!allduplicated(S[,c("LName","MInit","FName","CPREVSCH","YEARAPAY")]),] #1,200 duplicates
  Data<-merge(Data,S[,c("IDCB","LName","MInit","FName","CPREVSCH","YEARAPAY")],all.x=T) 
  Final<-rbind.fill(Final,Data[!is.na(Data$IDCB),])
  Data<-Data[is.na(Data$IDCB),!names(Data)=="IDCB"]
  S<-S[!S$IDCB%in%Final$IDCB,] 
  
  #Add birthday and reduce to FInit, still with MInit.
  S<-S[!allduplicated(S[,c("LName","MInit","FInit","BDay","BMonth","BYear","CPREVSCH","YEARAPAY")]),] #1,800 duplicates
  Data<-merge(Data,S[,c("IDCB","LName","MInit","FInit","BDay","BMonth","BYear","CPREVSCH","YEARAPAY")],all.x=T) 
  Final<-rbind.fill(Final,Data[!is.na(Data$IDCB),])
  Data<-Data[is.na(Data$IDCB),!names(Data)=="IDCB"]
  S<-S[!S$IDCB%in%Final$IDCB,] 
  
  #Now drop MInit.
  S<-S[!allduplicated(S[,c("LName","FInit","BDay","BMonth","BYear","CPREVSCH","YEARAPAY")]),] #1,300 duplicates
  Data<-merge(Data,S[,c("IDCB","LName","FInit","BDay","BMonth","BYear","CPREVSCH","YEARAPAY")],all.x=T) 
  Final<-rbind.fill(Final,Data[!is.na(Data$IDCB),])
  Data<-Data[is.na(Data$IDCB),!names(Data)=="IDCB"]
  S<-S[!S$IDCB%in%Final$IDCB,] 
  Final$Quality[is.na(Final$Quality)]<-2
  
  SAT_IDs<-Final[,c("AL_KEY","IDCB","Quality")]
  save(SAT_IDs,file=paste0(secure_cb,"SAT_IDs.Rda"))
}












if(Construct_Value_Added_Data){
  load(paste0(secure_derived,"uadm_data_extract_processed.Rda"))
    A<-Data[Data$SCHTYPE%in%c("A","B"),] #Freshman applicants
  
  A$income_parent<-winsor(A$income_parent,0.01) #Top-coded
  A$income_parent_missing<-is.na(A$income_parent)
  A$logincome_parent<-log(1+A$income_parent) ; A$logincome_parent[is.na(A$income_parent)]<-0
  A$income_parent_zeros<-A$income_parent/1000 ; A$income_parent_zeros[is.na(A$income_parent)]<-0
  A$UnivGrad_SixYears_NSCOnly<-as.integer(NA_to_F(A$NSC_Grad_FourYear==1&A$NSC_Grad_Year<=A$YEARAPAY+6&!((A$NSC_Grad_Year==A$YEARAPAY+6) & (A$NSC_Grad_Month>6))))
  A$CETHNICA_Cat[A$CETHNICA%in%c("P","5. White")]<-"White" #NOTE: spelled-out ethnicities are from 2017
  A$CETHNICA_Cat[A$CETHNICA%in%c("B","1. African American")]<-"Black"
  A$CETHNICA_Cat[A$CETHNICA%in%c("A","K","M","2. American Indian","7. International")]<-"Other"
  A$CETHNICA_Cat[A$CETHNICA%in%c("C","J","3. Hispanic/Latino")]<-"Hispanic"
  A$CETHNICA_Cat[A$CETHNICA%in%c("D","F","G","H","L","N","V","4. Asian/Pacific Islander")]<-"Asian"
  A$CETHNICA_Cat[A$CETHNICA%in%c("E","","6. Unknown")]<-"Decline"
  
  A$DK1<-paste(A$ADM01,A$ADM03,A$ADM04,A$ADM05,A$ADM06,A$ADM07,A$ADM08,A$ADM09) #Three possibilities for each campus: admitted, not admitted, and didn't apply
  
  for(v in names(A)[grepl("College_Code$",names(A))]) A[,v]<-as.character(as.integer(gsub(".{3}$","",A[,v])))
  A$NSC_EnrFirstIncCC_College_Code[is.na(A$NSC_EnrFirstIncCC_College_Code)]<-""
  
  A$NSC_EnrFirstIncCC_College_Code_Eth<-paste(A$NSC_EnrFirstIncCC_College_Code,"-",A$CETHNICA_Cat) ; A$NSC_EnrFirstIncCC_College_Code_Eth[A$CETHNICA_Cat%in%c("Decline")]<-NA

  load(paste0(secure_cb,"SAT_IDs.Rda"))
    A<-merge(A,SAT_IDs,all.x=T)
  
  A_AA<-A[A$YEARAPAY%in%1995:1997,]
    for(v in names(A)[grepl("wage_sum_1[2-6]",names(A))]){ ; A_AA[,paste0(v,"_Uncondit")]<-winsor(A_AA[,v],0.01) ; A_AA[is.na(A_AA[,paste0(v,"_Uncondit")]),paste0(v,"_Uncondit")]<-0 ; }
    A_AA$Total_Wages30s_NumYears<-apply(A_AA[,grepl("wage_sum_(1[23456])_Uncondit",names(A_AA))],1,function(x) sum(x>0))
    A_AA$Total_Wages30s_Condit<-apply(A_AA[,grepl("wage_sum_(1[23456])_Uncondit",names(A_AA))],1,function(x) sum(x))/A_AA$Total_Wages30s_NumYears
    A_AA$WageVA<-A_AA$Total_Wages30s_Condit
    A_AA$GradVA<-(A_AA$UnivGrad_SixYears_NSCOnly==1)|NA_to_F(A_AA$HASGRAD6==1)

  load(paste0(secure_cb,"SATUniverse.Rda")) ; SAT<-S
    S_AA<-merge(A_AA,SAT[,c("IDCB",names(SAT)[grepl("^DI",names(SAT))])])
    S_AA$DK_CB<-as.factor(apply(S_AA[,grepl("^DI",names(S_AA))],1,function(x) paste(sort(x),collapse=" ")))
    S_AA<-S_AA[!S_AA$DK_CB%in%names(table(S_AA$DK_CB)[table(S_AA$DK_CB)==1])&S_AA$DK_CB!="",] #Throw out a lot of the sample because applicant set is unique
  rm(SAT) ; gc()
  
  ###
  #  Generate FEs without heterogeneity
  ###
  FEs<-A[!duplicated(A_AA$NSC_EnrFirstIncCC_College_Code),c("NSC_EnrFirstIncCC_College_Code","NSC_EnrFirstIncCC_College_Name")]
  FEs<-rbind.fill(FEs,A_AA[!duplicated(A_AA$NSC_EnrFirstIncCC_College_Code_Eth)&!is.na(A_AA$CETHNICA_Cat)&!is.na(A_AA$NSC_EnrFirstIncCC_College_Code_Eth),c("NSC_EnrFirstIncCC_College_Name","NSC_EnrFirstIncCC_College_Code_Eth","CETHNICA_Cat")])
  
  types<-list(c("_Raw","factor(YEARAPAY)|factor(IndepVar)"),
              c("_Mountjoy","factor(YEARAPAY)|factor(IndepVar)+factor(DK1)"),
              c("_DKCB","factor(YEARAPAY)|factor(IndepVar)+factor(DK_CB)"),
              c("_Chetty","income_parent_missing+poly(income_parent_zeros,5)+poly(satact,5)|IndepVar+YEARAPAY+CETHNICA")
  )
  for(IndepVar in c("NSC_EnrFirstIncCC_College_Code","NSC_EnrFirstIncCC_College_Code_Eth")){
    for(s in c("_AA")){
      for(t in types){
        for(v in c("GradVA","WageVA","hsgpa")){
          data<-get(paste0("A",s)) ; if(t[1]=="_DKCB") data<-get(paste0("S",s))
          data<-data[!is.na(data[,v]),]
          data$IndepVar<-data[,IndepVar] ; data<-data[!is.na(data$IndepVar),]
          for(zzz in c("satact","hsgpa")) if(grepl(zzz,t[2])) data<-data[!is.na(data[,zzz]),] ;
          eval(parse(text=paste0("reg<-felm(",v,"~",t[2],",data[data$IndepVar%in%names(table(data$IndepVar))[table(data$IndepVar)>=",(100-50*(IndepVar=="NSC_EnrFirstIncCC_College_Code_Eth"))/(1+(s=="_AA")),"],])")))
          temp<-getfe(reg,se = T)
          temp<-temp[grepl("IndepVar",temp$fe),c("effect","se","obs","idx")]
          names(temp)<-c(paste0(c("FE_","FE_se_","FE_obs_"),v,t[1],s),IndepVar)
          FEs<-rbind.fill(FEs[is.na(FEs[,IndepVar]),]
                          ,merge(FEs[!is.na(FEs[,IndepVar]),!names(FEs)%in%names(temp)[1:3]],temp,all.x=T))
          print(paste(Sys.time(),IndepVar,s,t[[1]],v))
        }
      }
    }
  }
  FEs<-FEs[!(duplicated(FEs$NSC_EnrFirstIncCC_College_Code)&!is.na(FEs$NSC_EnrFirstIncCC_College_Code)),]

  save(FEs,file=paste0(secure_derived,"NSC_ValueAdded_Measures.Rda"))
}










if(Construct_ACS_Data){
  ###
  #  Part 1: Clean ACS data (from IPUMS)
  ###
  ddi <- read_ipums_ddi("Data/Raw/usa_00072.xml")
  data <- read_ipums_micro(ddi)
    data$RACE[data$HISPAN!=0]<-99
  
  data$SERIAL<-as.integer(data$SERIAL)
  if(0==1){ #Too time-consuming to run every time
    hhinc<-data %>% group_by(SERIAL, YEAR) %>% summarise(HHINC = sum(INCTOT))
    save(hhinc,file="Data/Derived/HHInc_usa00072.Rda")
  } else load("Data/Derived/HHInc_usa00072.Rda")
  data<-merge(data,hhinc,all.x=T) ; gc()
  
  #CPI-adjust to 2018
  CPI<-read.csv("Data/Raw/CPI_California.csv") ; names(CPI)[1]<-"YEAR"
  data<-merge(data,CPI[,c(1:2)]) ; gc()
  for(v in names(data)[grepl("^(HH)?(INC)",names(data))]){
    data[,v]<-data[,v]/(data$CPI/CPI$CPI[CPI$YEAR==2018])
  }
  
  data$White<-data$RACE==1
  data$URM<-data$RACE%in%c(2,3,99)
  stem<-c(11,13,21,24,25,36,37,38,50,51,57,58,59,61) ; data$STEM<-NA_to_F(data$DEGFIELD%in%stem)|NA_to_F(data$DEGFIELD2%in%stem) ; data$STEM_Condit[data$EDUCD>=101]<-data$STEM[data$EDUCD>=101]
  data$Eth[data$RACE==1]<-"White"
  data$Eth[data$RACE%in%c(4,5,6)]<-"Asian"
  data$Eth[data$RACE%in%c(2,3,99)]<-"URM"
  data$K100<-data$INCWAGE>=100000 ; data$K150<-data$INCWAGE>=150000
  for(i in c(25,30,35,40)) data$AgeGroup[data$AGE%in%((i+2*(i==35)):(i+4-2*(i==25)))]<-i
  data$CA<-data$STATEFIP==6
  data$FromCA<-data$BPL==6 ; data$FromCA[data$BPL>99]<-NA
  data$BState<-data$BPL ; data$BState[data$BState>100]<-NA
  data$GradSch<-data$EDUCD>101
  data$loginc<-log(1+data$INCWAGE)
  
  save(data,file="Data/Derived/ACS_Data_Clean.Rda")
  
  ###
  #  Part 2: Generate distributions, for baseline wage analysis
  ###
  data$YBirth<-data$YEAR-data$AGE
  D<-data[data$YBirth%in%1974:1978&data$CA&data$EDUCD>100&data$INCWAGE>0,]
  cuts<-data.frame()
  for(y in 2001:2017){
    for(e in c("Asian","URM","White")){
      z<-weighted.quantile(D$INCWAGE[D$YEAR==y&D$Eth==e],D$PERWT[D$YEAR==y&D$Eth==e],seq(0,1,0.01))
      cuts<-rbind(cuts,data.frame(Year=rep(y,101),Eth=e,Quant=0:100,Val=z))
    }
  }
  save(cuts,file="Data/Derived/Income_Quants_AA.Rda")
}



stopCluster(cl)
